****************************************************************************************************
*This file cleans data of Chinese imports from China at the state level of the UN Comtrade database*
****************************************************************************************************

*******************************************************************************************
*	Employment by states (SIC, from Autor, Dorn, Hanson, Acemoglu 2015) & Brendan Price   *
*******************************************************************************************

use "$contributed_data/cbp_state_1991.dta", clear

keep if year == 1991
rename sic87dd sic

rename year year_emp

*Combine sectors that are not in manufacturing in aggregate sectors
replace sic = 100 if sic <999
replace sic = 1000 if sic>=1000&sic<=1499
replace sic = 1500 if sic>=1500&sic<=1799
replace sic = 4000 if sic>=4000&sic<=4999
replace sic = 5000 if sic>=5000&sic<=5199
replace sic = 6000 if sic>=6000&sic<=6799
replace sic = 7000 if sic>=7000&sic<=8999
drop if sic>=9000 //only interested in employment from private sector afterwards (no public administration needed)

gen industry = "agriculture" if sic == 100
replace industry = "mining" if sic == 1000
replace industry = "construction" if sic == 1500
replace industry = "manufacturing" if sic >= 2000 & sic<=3999
replace industry = "utilities" if sic == 4000
replace industry = "wholesale" if sic == 5000
replace industry = "retail" if sic == 5200
replace industry = "finance" if sic == 6000
replace industry = "services" if sic == 7000

collapse (firstnm) year_emp industry (sum) emp, by(sic state)

gen sic87dd = sic

save "$clean_data_imports/employment_state_industry_final", replace

*Only MANUFACTURING sector
use "$clean_data_imports/employment_state_industry_final", clear
keep if sic>=2010 & sic<=3999

by sic, sort: gen nvals = _n == 1 
count if nvals
*392 industries
drop nvals

save "$clean_data_imports/employment_state_industry_final_manufacturing", replace

*Dataset extensions over all years between 1993 and 2017
use "$clean_data_imports/employment_state_industry_final", clear

*Need to expand employment dataset such that I have one obs for each sic, cz and year (repeated values of employment in 1990)
*for merge with import data by years for entire us

forval i=1993/2017 {
gen year`i' = emp
}

drop emp

reshape long year, i(state sic year_emp) j(emp)
rename (year emp) (employment1991 year)

save "$clean_data_imports/employment_state_industry_final_years", replace

************************
* Long-run differences
************************

use "$clean_data_imports/CHN_imports_dataset_d", clear

foreach g in "us" "iv" {
	foreach t in "1993_2000" "1993_2007" "1993_2014" "2000_2007" "2000_2014" "2007_2014" {
		local start = substr("`t'", 1, 4)
		local end = substr("`t'", 6, 4)

		* Load cleaned CZ-level data
		use "$clean_data_imports/employment_state_industry_final", clear

		* Compute industry employment shares
		bysort state year: egen state_emp = total(emp)
		gen empshare = emp/state_emp
		
		*Employment by industry over all CZ (total US)
		bysort sic87dd year: egen ind_emp = total(emp)
		
		drop emp state_emp

		* Merge in each industry's change in import exposure
		merge m:1 sic87dd using "$clean_data_imports/CHN_imports_dataset_d", assert(1 3) 
		assert sic87dd < 2000 | sic87dd > 3999 if _merge == 1
		

		* Define non-manufacturing changes in import exposure as zero
		replace d_`g'_imports_`t' = 0 if _merge == 1
		drop _merge
		
		*Change in imports (in 1'000 of 2017 USD) per worker
		replace d_`g'_imports_`t' = ((d_`g'_imports_`t'/1000)/ind_emp)
		
		* Calculate average changes in import exposure
		collapse (mean) d_state_imp_exp_`g'_`t' = d_`g'_imports_`t' [aw = empshare], by(state year)

		if "`g'" == "us" {
			label var d_state_imp_exp_`g'_`t' "Change in CZ's average import exposure, `start'-`end'"
		}
		else if "`g'" == "iv" {
			label var d_state_imp_exp_`g'_`t' "Instrument for change in CZ's average import exposure, `start'-`end'"
		}
		
		* Save the shocks
		tempfile d_state_imp_exp_`g'_`t'
		save "`d_state_imp_exp_`g'_`t''.dta", replace
	}
}

* Combine the various changes in CZ import exposure into a single file
use "`d_state_imp_exp_us_1993_2000'.dta", clear
merge 1:1 state using "`d_state_imp_exp_us_1993_2007'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_us_1993_2014'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_us_2000_2007'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_us_2000_2014'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_us_2007_2014'.dta", assert(3) nogenerate

merge 1:1 state using "`d_state_imp_exp_iv_1993_2000'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_iv_1993_2007'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_iv_1993_2014'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_iv_2000_2007'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_iv_2000_2014'.dta", assert(3) nogenerate
merge 1:1 state using "`d_state_imp_exp_iv_2007_2014'.dta", assert(3) nogenerate


compress
save "$final_data_imports/state_CHN_imports_dataset_d_long", replace


************************
* Stacked differences
************************

use "$final_data_imports/state_CHN_imports_dataset_d_long", clear

reshape long d_state_imp_exp_, i(state year_emp) j(temp) string
gen years = substr(temp, 4, 9)
gen country = substr(temp, 1, 2)
drop temp
gen start = substr(years, 1, 4)
gen end = substr(years, 6, 4)
destring start end, replace

save "`imports_stacked3''.dta", replace

foreach g in "us" "iv" {
	foreach t1 in "1993_2000" {
		foreach t2 in "2000_2007"{
			foreach t3 in "2007_2014" {
			use "`imports_stacked3''.dta", clear
			*keep if (years == "1993_2000" | years == "2000_2007" | years == "2007_2011") & country == "us"

			keep if (years == "`t1'" | years == "`t2'" | years == "`t3'") & country == "`g'"
			if end[1]==start[2] & end[2]==start[3] { //for example 1993-2000, 2000-2007, but not 1993-2000, 2007-2014
	
			egen group = group(state)
			gen constant = 1
			bysort group: gen period3 = sum(constant)
			
			local period1 = substr(years[1],1,9)
			local period2 = substr(years[2],5,5)
			local period3 = substr(years[3],5,5)
			local period `period1'`period2'`period3'
			rename d_state_imp_exp_ d3_exp_`g'_`period'

			keep state year_emp d3_exp_`g'_`period' period3
			tempfile d_exp_`g'_`period'
			save "`d_exp_`g'_`period''.dta", replace
		}
	}
	}
	}
}
use "`d_exp_us_1993_2000_2007_2014'.dta", clear

merge 1:1 state period3 using "`d_exp_iv_1993_2000_2007_2014'.dta", assert(3) nogenerate

compress
save "$final_data_imports/state_CHN_imports_dataset_d_stacked3", replace

*Merge together with measure for all sectors - LONG-RUN
use "$final_data_imports/state_CHN_imports_dataset_d_long", clear
*Rename such that consistent with automation variables
rename (d_state_imp*1993_2000* d_state_imp*1993_2007* d_state_imp*1993_2014* d_state_imp*2000_2007* d_state_imp*2000_2014* d_state_imp*2007_2014*) (d_state_imp*93_00* d_state_imp*93_07* d_state_imp*93_14* d_state_imp*00_07* d_state_imp*00_14* d_state_imp*07_14*)
save "$final_data_imports/state_ExposureChinaLong", replace


*3 Periods (1993-2000-2007-2014)
use "$final_data_imports/state_CHN_imports_dataset_d_stacked3", clear

gen year = 1990 if period3 == 1
replace year = 2000 if period3 == 2
replace year = 2008 if period3 == 3

rename (d3_exp_us_1993_2000_2007_2014 d3_exp_iv_1993_2000_2007_2014) (d_exp_state_us_stacked d_exp_state_iv_stacked)
keep state year d_exp_state_us_stacked d_exp_state_iv_stacked

destring statefip, replace

save "$final_data_imports/state_ExposureChina.dta", replace
cap rm "`imports_stacked3''.dta"
